# Remove the following two lines if you want to use other tools for plotting.
library(ggplot2) # you might have to install ggplot2 using `install.packages("ggplot2")`. You can also use something else to plot.
library(ggfortify) # you might have to install ggfortify using `install.packages("ggfortify")`. [autoplot](https://cran.r-project.org/web/packages/ggfortify/vignettes/plot_pca.html) might be useful for looking at the PCA results.
Follow these steps to perform an archetype analysis of the finch data:
1. Load the data from data/finches.csv (read.csv should do the trick).
2. Perform a PCA on the data; princomp is recommended.
3. Look at the loadings (pca_result$loadings). You can also use autoplot or biplot to generate a biplot of the components and loadings. What are the contributions of the features to the first two principal components?
4. Compute the convex hull of the data in the plane of the first two principal components (chull is recommended) and highlight it in the PCA plot.
finches <- read.csv("~/Dev/r-studio-binder/data/finches.csv", header=TRUE)
# Principal component analysis of the finch data. `cor = TRUE` runs the
# analysis on the correlation matrix (i.e. on centred and scaled variables);
# `scores = TRUE` keeps the score of every observation on each component.
pca <- princomp(x = finches, cor = TRUE, scores = TRUE)
Plot the variance captured by the 5 principal components.
# Show how strongly each measured feature contributes to each component.
pca[["loadings"]]
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## wing.length 0.459 0.110 0.364 0.772 0.221
## tarsus.length 0.449 0.234 0.639 -0.547 -0.190
## upper.mandible.length 0.428 0.668 -0.604
## upper.mandible.depth 0.452 -0.445 -0.210 -0.300 0.681
## lower.mandible.width 0.447 -0.538 -0.224 0.108 -0.670
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
# Biplot of the first two principal components together with the loadings.
autoplot(pca, loadings = TRUE, loadings.label = TRUE)

# Scores of every observation on the principal components. This must exist
# before `chull()` is called, because the hull is computed from its columns.
pca_df <- data.frame(pca$scores)

# Row indices of the observations lying on the convex hull in the
# Comp.1/Comp.2 plane.
hull <- chull(pca_df$Comp.1, pca_df$Comp.2)
hull
## [1] 10 80 75 94 114 113 117 127 62 130 125 60

# Scatter plot of the scores with the hull drawn as a white polygon and the
# hull vertices marked in red; all observations are drawn last, on top.
ggplot(data = pca_df, aes(x = Comp.1, y = Comp.2)) +
  geom_polygon(data = pca_df[hull, ], fill = "white") +
  geom_point(data = pca_df[hull, ], colour = "red", size = 5) +
  geom_point()
# Find the three convex-hull vertices that span the triangle of largest area
# in the Comp.1/Comp.2 plane. Results: `area` (the largest area) and `winner`
# (the three row indices of `pca_df` forming that triangle).

# Area of the triangle (a, b, c) via the shoelace formula. The arguments are
# numeric vectors of equal length, so many triangles are evaluated in one call.
triangle_area <- function(ax, ay, bx, by, cx, cy) {
  abs(ax * (by - cy) + bx * (cy - ay) + cx * (ay - by)) / 2
}

# A triangle needs three vertices. The guard also keeps `combn()` from
# expanding a length-one `hull` into `seq_len(hull)`.
stopifnot(length(hull) >= 3)

# One column per unordered triple of hull vertices; every triple is examined
# exactly once, so no vertex is repeated within a triangle.
triples <- combn(hull, 3)
areas <- triangle_area(
  pca_df$Comp.1[triples[1, ]], pca_df$Comp.2[triples[1, ]],
  pca_df$Comp.1[triples[2, ]], pca_df$Comp.2[triples[2, ]],
  pca_df$Comp.1[triples[3, ]], pca_df$Comp.2[triples[3, ]]
)

best <- which.max(areas)
area <- areas[best]
winner <- triples[, best]
print(area)
# NOTE(review): the recorded output below was produced by the earlier version
# of this search, which kept the third vertex fixed at the last hull point and
# used `ay + by` in the area formula. Re-run the script to refresh it.
## [1] 0.2920978
## [1] 1.078508
## [1] 1.763069
## [1] 5.193567
## [1] 7.029989
print("The three points in the convex hull that correspond to the triangle with largest area")
## [1] "The three points in the convex hull that correspond to the triangle with largest area"
print(winner)
## [1] 10 114 60
# Redraw the PCA scatter plot with the convex hull and overlay the outline of
# the largest triangle in cyan. The first vertex is repeated at the end so
# that the path closes.
pca_df <- data.frame(pca$scores)
ggplot(pca_df, aes(Comp.1, Comp.2)) +
  geom_polygon(fill = "white", data = pca_df[hull, ]) +
  geom_point(size = 5, colour = "red", data = pca_df[hull, ]) +
  geom_path(colour = "cyan", data = pca_df[c(winner, winner[1]), ]) +
  geom_point()
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the second data set and keep only those numeric columns that contain
# no missing values.
df <- read.csv(file = "data/szekely-2015-data.csv", header = TRUE)
df_numeric <- dplyr::select_if(
  dplyr::select_if(df, is.numeric),
  function(column) !anyNA(column)
)
You can also embed plots, for example: